library(dplyr)
library(colorRamps)
library(mapr)
library(raster)
library(sf)
library(tidyverse)
library(sdm)
library(knitr)
library(readr)
library(usdm)
library(ENMeval)
library(dismo)
library(tidyr)
require(ENMGadgets)
library(reshape2)
library(maptools)
library(corrplot)
library(beepr)
library(ntbox)
library(sqldf)
library(fields)
library(rgdal)
Human cases of dengue fever were gathered from the Brazil health ministry database (DATASUS in 2019 from http://www2.datasus.gov.br/). Only cases recorded between 2007 and 2019 were available. However, only cases between 2013-2016 had lab tests confirmation available and entered the model (mainly by serological tests). As municipality was the thinnest precision available, only one case by municipality entered the model.
a<-("C:/Users/bsmah/OneDrive/Área de Trabalho/novas_bases_de_dados_novas_doencas")
setwd(a)
dng<-readr::read_csv("occ_dengue_clean_lim_oppc2.csv")
dng$species<-1
sum(dng$species)
## [1] 3098
This represents the number of municipalities where dengue was confirmed between 2007 and 2018.
Setting all the predictor variables, in order to capture the environmental and socioeconomic signature from occurrences. set the directory where the predictor variables were saved.
Those were all the variables that originally entered the models: First the Worldclim variables (without the “quarter” variables) and:
plot(var[[1:11]])
plot(var[[12:19]])
Now, a quick look at the occurrence points inside the variables limits:
plot(var[[1]], col='gray')+points(dng2)
## integer(0)
Before choosing variables, we should have a quick check in the variables colinearity to see how they correlate to each other.
var.da <- var %>%
raster::values() %>%
na.omit
corr <- cor(var.da, method = "spearman")
corrplot:: corrplot.mixed(corr, tl.srt = 45, mar = c(3, 0.5, 2, 1))
Some very interesting correlations.
Now we going choose the smallest number of variables as possible, in order to raise predictive power:
##First the environmental variables -Worldclim’s variables: bio 01 (annual mean temperature) and bio 12 (annual precipitation)
-Deforestation between 2009 and 2018 (Mapbiomas- https://mapbiomas.org/) #deforestation was highly correlated with forest cover so we decided to keep only deforestation.
##Second socioeconomic variables:
-Gross domestic product (Kummu et al 2018 https://doi.org/10.1038/sdata.2018.4) #was highly correlated with population density,
names(var)
## [1] "bio01" "bio02" "bio03" "bio04"
## [5] "bio05" "bio06" "bio07" "bio12"
## [9] "bio13" "bio14" "bio15" "GDP"
## [13] "Forest_cover_16" "Deforestation" "Pop_density" "Gini"
## [17] "IDH" "Canned_water" "No_toilet"
var_dng3<-var[[c(1,8,12,14,16, 19)]]
names(var_dng3)
## [1] "bio01" "bio12" "GDP" "Deforestation"
## [5] "Gini" "No_toilet"
plot(var_dng3)
var.da_dng3 <- var_dng3 %>%
raster::values() %>%
na.omit
corr <- cor(var.da_dng3, method = "spearman") #
corrplot::corrplot.mixed(corr, tl.srt = 45, mar = c(3, 0.5, 2, 1))
Sorting 500 municipalities for CV
mun32<- read.csv("municipios.csv")#municipalities coordinates from Brazil
mun34 <- mun32[, c(3,4)]
mun35<-cbind(mun34[,2], mun34[,1])#adjusting for ENMeval
set.seed(31)
bg_mun<-mun35[sample(nrow(mun35), 500), ]#sorting 500 muncipalities randomly
colnames(bg_mun)<-c("lon", "lat")
plot(var_dng3[[1]])+points(bg_mun)
## integer(0)
#deviding the occurrences in 4 blocks using ENMeval
block_dng2<-get.block(dng2, bg_mun)
str(block_dng2)
## List of 2
## $ occs.grp: num [1:3098] 1 1 1 1 1 1 1 1 1 1 ...
## $ bg.grp : num [1:500] 3 3 3 3 3 3 3 3 3 3 ...
plot(var_dng3[[1]],col='gray')+ points(dng2, pch=21, bg=block_dng2$occ.grp)
## integer(0)
Dividing the occurrences in four different blocks so we can run a cross validation H-block.
First preparing the data for model run
dng3<-as.data.frame(cbind(dng2$species, dng2$lon, dng2$lat))
head(dng3)
## V1 V2 V3
## 1 1 -61.9953 -11.92830
## 2 1 -63.0325 -9.90571
## 3 1 -60.5520 -13.49450
## 4 1 -61.4562 -11.43430
## 5 1 -60.8168 -13.18700
## 6 1 -60.5454 -13.11740
colnames(dng3)<-c("sp_name", "longitude", "latitude")
dng3$sp_name<-"dengue_fever"
dng4<-dng3[,-1]
head(dng2)
## species
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
set.seed(31)#seting aleatory number for reproducibility.
d_dng<-sdmData(species~., dng2, predictors=var_dng3, bg=block_dng2$bg.grp)
#building the model In order to build the Ecological niche models we used the sdm package from Naimi et al. (2016; doi: 10.1111/ecog.01881) We used different algorithms like glm, maxent, bioclim, random forest and svm to build the models.
set.seed(31)#we set a seed to certify that random number generations will be the same thus, we can reproduce the procedure.
m_dng<-sdm(species~., d_dng,methods=c('glm','maxent','bioclim', 'rf', 'svm'),#algorithms used in the models
replication='cv', #setting method of replication for cross validation
test.p=30, #set 30% of records for test
cv.folds=11, #number of folds in cross validation
n=5,#number of recplicates per algorithm
parallelSettings=list(ncore=4, method='parallel')) #replication - specify which replication method should be used- cross validation and subsampling
## Carregando pacotes exigidos: parallel
m_dng
## class : sdmModels
## ========================================================
## number of species : 1
## number of modelling methods : 5
## names of modelling methods : glm, maxent, bioclim, rf, svm
## replicate.methods (data partitioning) : cross_validation
## number of replicates (each method) : 5
## toral number of replicates per model : 15 (per species)
## ------------------------------------------
## model run success percentage (per species) :
## ------------------------------------------
## method species
## ----------------------
## glm : 100 %
## maxent : 100 %
## bioclim : 100 %
## rf : 100 %
## svm : 100 %
##
## ###################################################################
## model Mean performance (per species), using test dataset (generated using partitioning):
## -------------------------------------------------------------------------------
##
## ## species : species
## =========================
##
## methods : AUC | COR | TSS | Deviance
## -------------------------------------------------------------------------
## glm : 0.79 | 0 | 0.62 | 0.09
## maxent : NaN | NaN | NaN | NaN
## bioclim : 0.61 | 0.01 | 0.44 | 7.41
## rf : 0.82 | 0.21 | 0.66 | 0.01
## svm : 0.53 | 0.03 | 0.53 | 0.02
Since the model is done let’s have a look in the individual predictions.
dng_p1<-predict(m_dng, var_dng3, filename='dng_sdm5.img', overwrite=TRUE)
plot(dng_p1)
Ploting at least one prediction of each algorithm to have a closer look…
plot(dng_p1[[c(1,7,15,16,21,30,31,37,45,46,53,60,61,68,75)]])
#have a look in all algorithms
We can check if the prediction is informative
dng_bin1 <- bin_model(dng_p1[[1:15]], dng2,percent = 10)#percent is the threshold
dng_bin2 <- bin_model(dng_p1[[16:30]],dng2,percent = 10)
dng_bin3 <- bin_model(dng_p1[[31:45]],dng2,percent = 10)
dng_bin4 <- bin_model(dng_p1[[46:60]],dng2,percent = 10)
dng_bin5 <- bin_model(dng_p1[[61:75]],dng2,percent = 10)
#and now cheking each prediction
plot(dng_bin1)#glm
plot(dng_bin2)#maxent
plot(dng_bin3)#bioclim
plot(dng_bin4)#Random Forest
plot(dng_bin5)#SVM
It’s a bit dificult to look so let’s plot 2 of each algorithm
dng_binary<-raster::stack(dng_bin1,
dng_bin2,
dng_bin3,
dng_bin4,
dng_bin5)
plot(dng_binary[[c(1,7,15,16,21,30,31,37,45,46,53,60,61,68,75)]])
#have a look in all algorithms
Bioclim and Maxent did not bring any information after binarization (Maxent did not bring information before binarization either).
So we decided to re-run dengue models without these two algorithms. ##Model Re-run Just repeat the steps so far without Maxent and Bioclim
set.seed(31)
m_dng<-sdm(species~., d_dng,methods=c('glm', 'rf', 'svm'),
replication='cv',
test.p=30,
cv.folds=11,
n=5,
parallelSettings=list(ncore=4, method='parallel'))
dng_p1<-predict(m_dng, var_dng3, filename='dng_sdm4.img', overwrite=TRUE)
dng_bin1 <- bin_model(dng_p1[[1:15]], dng2,percent = 10)#glm
dng_bin2 <- bin_model(dng_p1[[16:30]],dng2,percent = 10)#random forest
dng_bin3 <- bin_model(dng_p1[[31:45]],dng2,percent = 10)#svm
dng_binary<-raster::stack(dng_bin1,
dng_bin2,
dng_bin3)
#Now the individual models are ready for ensemble ##Building the socioeconomic + environemnt ensemble model
dng_p2_en2 <- ensemble(m_dng, dng_binary, filename='en_dng_sdm_binary35.tif', setting=list(method='unweighted'),)
##
## ......... the Raster object is used as the predicted probabilities...
#'method='unweighted' - uses a ensemble strategies withouth weights. In this way, the suitability values from the models are means calculated from the different individual models.
plot(dng_p2_en2, col=matlab.like2(100))
Plotting it with occurrences
plot(dng_p2_en2, col=matlab.like2(100))+points(dng2)
## integer(0)
Plotting in a good looking pallete of colors
dng_viridis<-as.data.frame(dng_p2_en2, xy=T)
head(dng_viridis)
## x y en_dng_sdm_binary35
## 1 -73.91667 5.25 NA
## 2 -73.75000 5.25 NA
## 3 -73.58333 5.25 NA
## 4 -73.41667 5.25 NA
## 5 -73.25000 5.25 NA
## 6 -73.08333 5.25 NA
dng_test1<-ggplot() +
geom_raster(data = subset(dng_viridis, !is.na(en_dng_sdm_binary35)), aes(x=x, y=y, fill=en_dng_sdm_binary35),
alpha=0.8)+
scale_alpha(range = c(0.15, 0.65), guide = "none") +
scale_fill_viridis_c()+
theme_bw()+
labs(x=NULL, y=NULL, fill="Disease risk (suitability)")+
coord_quickmap()
dng_test1
dng_test11<-dng_test1+theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
dng_test11+geom_point(data = dng3,
aes(x = longitude,
y = latitude,
color=sp_name),
shape=1,
size = 2)+
scale_shape_manual(values = 1) +
scale_color_manual(values = "#FC4E07")+
labs(color="Disease Occurrences", shape=NULL)
#Partial Roc evaluation Now performing the partial Roc, with pROC
function from ‘ntbox’ package (Osorio-Olvera et al. 2020; https://doi.org/10.1111/2041-210X.13452) The partial ROC
metrics were applied using ntbox R package to 50% subset of presence
data, based on 1000 iterations and using a 5% error threshold (all
default arguments).
pROC_dng <- pROC(continuous_mod=dng_p2_en2,
test_data=dng4,
n_iter=1000,
E_percent=5,
boost_percent=50,
parallel=FALSE)
pROC_dng$pROC_summary
## Mean_AUC Mean_pAUC_ratio_at_5% P_value
## 0.7806499 1.2742081 0.0000000
Now ploting the relative variable importance using the function ‘getVarImp’ from sdm package (Naimi et al. 2016) from the model. This informs the variables that are most important for defining the model.
plot(getVarImp(m_dng))
##
## The variable importance for all the models are combined (averaged)...
Response curves area other way to explore variable importance, in order to visualize how each variable responds to the response variable, in this case disease suitability. (this procedure is implemented in sdm package, following Elith et al. 2005; https://doi.org/10.1016/j.ecolmodel.2004.12.007)
rcurve(m_dng)
## The id argument is not specified; The modelIDs of 45 successfully fitted models are assigned to id...!
In this method response curves are strange since the models were
binarized in order to promote interpretation. we can have a look each
algorithm by time
rcurve(m_dng, id=1:15)#glm
rcurve(m_dng, id=16:30)#random forest
rcurve(m_dng, id=31:45)#SVM
Selecting only environment relevant variables
names(var)
## [1] "bio01" "bio02" "bio03" "bio04"
## [5] "bio05" "bio06" "bio07" "bio12"
## [9] "bio13" "bio14" "bio15" "GDP"
## [13] "Forest_cover_16" "Deforestation" "Pop_density" "Gini"
## [17] "IDH" "Canned_water" "No_toilet"
var_clim<-var[[c(1:11,13)]]
plot(var_clim)
names(var_clim)
## [1] "bio01" "bio02" "bio03" "bio04"
## [5] "bio05" "bio06" "bio07" "bio12"
## [9] "bio13" "bio14" "bio15" "Forest_cover_16"
var.da_clim <- var_clim %>%
raster::values() %>%
na.omit
corr <- cor(var.da_clim, method = "spearman")
corrplot:: corrplot.mixed(corr, tl.srt = 45, mar = c(3, 0.5, 2, 1))
Now we will choose the variables to make a collection of just
environmental variables. We sought to recreate a variable selection
process that was as close to the traditional ENM approach as possible.
To identify less correlated climatic variables, we utilized the
‘vifstep’ function from the sdm R package (61). We utilized vifstep to
select them, removing variables with an overall correlation score of
more than ‘2’, which is a conservative number for colinearity. As a
result, there is no colinearity among the variables, and the
environmental models continue to use a set of variables that are similar
to socioeconomic models (which is important since we want to compare the
performance between them).
ex_dng_clim<-raster::extract(var_clim, dng2) #extracting records from variables among occurrences
v_dng_clim<-vifstep(ex_dng_clim, th = 2)#calculating
var_dng_clim<-exclude(var, v_dng_clim)
var.da_dng_clim <- var_dng_clim %>%
raster::values() %>%
na.omit
corr <- cor(var.da_dng_clim, method = "spearman")
corrplot:: corrplot.mixed(corr, tl.srt = 45, mar = c(3, 0.5, 2, 1))
#generating the environment only models Here we will generate a new ENM using the same occurrence data and the new set of variables (var_dng_clim), performing the same cross validation as before. Furthermore, we will use the same set of algorithms:
set.seed(31)
d_dng_clim<-sdmData(species~.,dng2, predictors = var_dng_clim, bg=block_dng2$bg.grp)
## Warning in if (nrow(x) > size) {: a condição tem comprimento > 1 e somente o
## primeiro elemento será usado
m_dng_clim<-sdm(species~., d_dng_clim, methods=c('maxent','bioclim','glm', 'rf', 'svm'),
replication='cv',
test.p=30,
cv.folds=11,
n=5,
parallelSettings=list(ncore=4, method='parallel'))
Now repeating the same procedure of binarization and checking how individual algorithms performed:
dng_p1_clim<-predict(m_dng_clim, var_dng_clim, filename='model_dng_tegu_comcv.img', overwrite=TRUE)
plot(dng_p1_clim)
dng_clim_bin1 <- bin_model(dng_p1_clim[[1:15]], dng2,percent = 10)
dng_clim_bin2 <- bin_model(dng_p1_clim[[16:30]],dng2,percent = 10)
dng_clim_bin3 <- bin_model(dng_p1_clim[[31:45]],dng2,percent = 10)
dng_clim_bin4 <- bin_model(dng_p1_clim[[46:60]],dng2,percent = 10)
dng_clim_bin5 <- bin_model(dng_p1_clim[[61:75]],dng2,percent = 10)
dng_clim_binary<-raster::stack(dng_clim_bin1,
dng_clim_bin2,
dng_clim_bin3,
dng_clim_bin4,
dng_clim_bin5)
plot(dng_clim_binary[[c(1,7,15,16,21,30,31,37,45,46,53,60,61,68,75)]])
Again Bioclim and Maxent turn up to be non informative models after binarization. So we will repeat the process to generate the model without these algorithms.
set.seed(31)
m_dng_clim<-sdm(species~., d_dng_clim, methods=c('glm', 'rf', 'svm'),
replication='cv',
test.p=30,
cv.folds=11,
n=5,
parallelSettings=list(ncore=4, method='parallel'))
dng_p1_clim<-predict(m_dng_clim, var_dng_clim, filename='model_dng_cv.img', overwrite=TRUE)
dng_clim_bin1 <- bin_model(dng_p1_clim[[1:15]], dng2,percent = 10)
dng_clim_bin2 <- bin_model(dng_p1_clim[[16:30]],dng2,percent = 10)
dng_clim_bin3 <- bin_model(dng_p1_clim[[31:45]],dng2,percent = 10)
dng_clim_binary<-raster::stack(dng_clim_bin1,
dng_clim_bin2,
dng_clim_bin3)
Peforming again the ensemble with the mean between the remaining algorithms.
dng_clim_en2 <- ensemble(m_dng_clim, dng_clim_binary, filename='en_dng_clim_binary38.tif', setting=list(method='unweighted'), )#setting the method for unweighted so ensemble will be the mean among other models
##
## ......... the Raster object is used as the predicted probabilities...
plot(dng_clim_en2, col=matlab.like2(100))
plot(dng_clim_en2, col=matlab.like2(100))+points(dng4)
## integer(0)
Plotting the same with a different pallet of colours:
dng_clim_viridis<-as.data.frame(dng_clim_en2, xy=T)
dng_test2<-ggplot() +
geom_raster(data = subset(dng_clim_viridis, !is.na(en_dng_clim_binary38)), aes(x=x, y=y, fill=en_dng_clim_binary38),
alpha=0.8)+
scale_alpha(range = c(0.15, 0.65), guide = "none") +
scale_fill_viridis_c()+
theme_bw()+
labs(x=NULL, y=NULL, fill="Disease risk (suitability)")+
coord_quickmap()
plot(dng_test2)
dng_test22<-dng_test2+theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
dng_test22
Now adding the occurrences as small red circles
dng_test22+geom_point(data = dng3,
aes(x = longitude,
y = latitude,
color=sp_name),
shape=1,
size = 2)+
scale_shape_manual(values = 1) +
scale_color_manual(values = "#FC4E07")+
labs(color="Disease Occurrences", shape=NULL)
## pROC metric for model comparasion
pROC_dng_clim <- pROC(continuous_mod=dng_clim_en2,
test_data = dng4,
n_iter=1000,E_percent=5,
boost_percent=50,
parallel=FALSE)
pROC_dng_clim$pROC_summary
## Mean_AUC Mean_pAUC_ratio_at_5% P_value
## 0.6972012 1.1861873 0.0000000
Now getting the response curves
rcurve(m_dng_clim)
## The id argument is not specified; The modelIDs of 45 successfully fitted models are assigned to id...!
Now to get the relative variable importance, the variable contribution
for the dengue environment only model:
plot(getVarImp(m_dng_clim))
##
## The variable importance for all the models are combined (averaged)...
This is the end of the example with dengue fever. Any adtional doubts please sende email to arthurama.magalhaes@gmail.com Thank you in advance, Arthur